Classifying Book Genres Based On Review Text Using Machine Learning¶

Jenna Bittner

Import dataframes¶

In [1]:
## IMPORT DATA

# Imports
import pandas as pd

# Load a sample of reviews to handle memory efficiently
df_reviews = pd.read_json(
    '/fs/ess/PAS2038/PHYSICS_5680_OSU/project_data/goodreads/goodreads/complete/goodreads_reviews_dedup.json.gz', 
    nrows=100000, 
    lines=True
)

# Every book id seen in the review sample; used to filter the genre file
book_ids = set(df_reviews['book_id'].unique())

# Stream the (large) genre file in chunks, keeping only relevant rows
genre_chunks = []
for chunk in pd.read_json(
    '/fs/ess/PAS2038/PHYSICS_5680_OSU/project_data/goodreads/goodreads/complete/goodreads_book_genres_initial.json.gz', 
    lines=True, 
    chunksize=10000
):
    # Keep rows whose book appears in the sample and whose genre dict is non-empty
    relevant = chunk['book_id'].isin(book_ids) & chunk['genres'].notna() & (chunk['genres'].str.len() > 0)
    kept = chunk[relevant]

    # Drop books whose only genre key is 'fiction' (too generic to be a useful label)
    kept = kept[kept['genres'].apply(lambda g: len(g) > 1 if 'fiction' in g else True)]

    genre_chunks.append(kept)

# Combine all filtered chunks into one dataframe
df_genres = pd.concat(genre_chunks, ignore_index=True)

# Merge reviews and genres on book_id (inner join: only books with genre info survive)
df = pd.merge(df_reviews, df_genres, on='book_id', how='inner')

# Sanity checks on the merged dataframe
print('Shape of merged dataframe:', df.shape)
print()
print('Missing data summary:\n', df.isnull().sum())
print()
print('Number of rows with null genres:', df_genres['genres'].isnull().sum())
Shape of merged dataframe: (98050, 12)

Missing data summary:
 user_id         0
book_id         0
review_id       0
rating          0
review_text     0
date_added      0
date_updated    0
read_at         0
started_at      0
n_votes         0
n_comments      0
genres          0
dtype: int64

Number of rows with null genres: 0

Explore data¶

In [2]:
# Inspect the columns of the merged reviews+genres dataframe
df.columns
Out[2]:
Index(['user_id', 'book_id', 'review_id', 'rating', 'review_text',
       'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
       'n_comments', 'genres'],
      dtype='object')
In [3]:
# Preview the first five merged rows ('genres' holds a {genre: count} dict per book)
df.head()
Out[3]:
user_id book_id review_id rating review_text date_added date_updated read_at started_at n_votes n_comments genres
0 8842281e1d1347389f2ab93d60773d4d 24375664 5cd416f3efc3f944fce4ce2db2290d5e 5 Mind blowingly cool. Best science fiction I've... Fri Aug 25 13:55:02 -0700 2017 Mon Oct 09 08:55:59 -0700 2017 Sat Oct 07 00:00:00 -0700 2017 Sat Aug 26 00:00:00 -0700 2017 16 0 {'fiction': 1059, 'fantasy, paranormal': 114}
1 df889690c61e3d0ced39614a3b4a07c1 24375664 46301875c07d309c0f53cfcbfd247196 4 It took me 2 months to read the first 200 page... Tue May 24 13:50:59 -0700 2016 Mon Oct 03 12:03:43 -0700 2016 Mon Oct 03 12:03:43 -0700 2016 Mon Aug 08 21:05:00 -0700 2016 0 0 {'fiction': 1059, 'fantasy, paranormal': 114}
2 dd39f0a2f72aeb2d44c59f3e9f1ddda2 24375664 1c6d353e3054c9485b037f06e8a00570 3 3.5/5.0 Sat Jul 02 03:09:16 -0700 2016 Sat Jul 02 03:10:16 -0700 2016 Sat Jul 02 03:10:16 -0700 2016 0 0 {'fiction': 1059, 'fantasy, paranormal': 114}
3 8842281e1d1347389f2ab93d60773d4d 18245960 dfdbb7b0eb5a7e4c26d59a937e2e5feb 5 This is a special book. It started slow for ab... Sun Jul 30 07:44:10 -0700 2017 Wed Aug 30 00:00:26 -0700 2017 Sat Aug 26 12:05:52 -0700 2017 Tue Aug 15 13:23:18 -0700 2017 28 1 {'fiction': 393, 'fantasy, paranormal': 341, '...
4 df889690c61e3d0ced39614a3b4a07c1 18245960 198f663bbbb2926f8ad723796eef23ed 0 If you like Contact, you'll love this \n This ... Tue May 24 13:54:43 -0700 2016 Tue May 24 13:54:43 -0700 2016 2 0 {'fiction': 393, 'fantasy, paranormal': 341, '...

Make some plots¶

In [4]:
## Distribution of genres

# Imports
import plotly.express as px

# Define a function to get the first genre
def get_first_genre(genre_dict):
    """Return the first genre key of a non-empty dict, else None."""
    if not isinstance(genre_dict, dict):
        return None  # Not a dict (e.g. NaN) -> no genre
    return next(iter(genre_dict), None)  # None when the dict is empty

# Derive the primary genre (first key of each row's genre dict)
df['primary_genre'] = df['genres'].apply(get_first_genre)

# Tally how often each primary genre occurs
genre_counts = df['primary_genre'].value_counts().reset_index()
genre_counts.columns = ['primary_genre', 'count']

# Bar chart of the ten most common primary genres
fig = px.bar(
    genre_counts.head(10),
    x='primary_genre',
    y='count',
    color='count',
    color_continuous_scale='Viridis',
    title='Top 10 Genres in Dataset',
    labels={'primary_genre': 'Genre', 'count': 'Count'}
)

# Tidy axis labels, size and margins
fig.update_layout(
    xaxis_title='Genre',
    yaxis_title='Count',
    xaxis_tickangle=-45,
    width=900,
    height=500,
    margin=dict(l=60, r=60, t=60, b=100)
)

fig.show()
fictionromancefantasy, paranormalnon-fictionyoung-adultmystery, thriller, crimecomics, graphichistory, historical fiction, biographychildrenpoetry05k10k15k20k
5k10k15k20kCountTop 10 Genres in DatasetGenreCount
plotly-logomark
In [5]:
## Distribution of secondary genres

# Define a function to get the second genre if it exists
def get_second_genre(genre_dict):
    """Return the second genre key when the dict holds at least two genres, else None."""
    if isinstance(genre_dict, dict):
        keys = list(genre_dict)
        if len(keys) > 1:
            return keys[1]
    return None  # Not a dict, or fewer than two genres

# Derive the secondary genre for every row
df['secondary_genre'] = df['genres'].apply(get_second_genre)

# Tally how often each secondary genre occurs
secondary_genre_counts = df['secondary_genre'].value_counts().reset_index()
secondary_genre_counts.columns = ['secondary_genre', 'count']

# Bar chart of the ten most common secondary genres
fig = px.bar(
    secondary_genre_counts.head(10),
    x='secondary_genre',
    y='count',
    color='count',
    color_continuous_scale='Viridis',
    title='Top 10 Subgenres in Dataset',
    labels={'secondary_genre': 'Secondary Genre', 'count': 'Count'}
)

# Tidy axis labels, size and margins
fig.update_layout(
    xaxis_title='Secondary Genre',
    yaxis_title='Count',
    xaxis_tickangle=-45,
    width=900,
    height=500,
    margin=dict(l=60, r=60, t=60, b=100)
)

fig.show()
fictionhistory, historical fiction, biographyromancefantasy, paranormalyoung-adultmystery, thriller, crimenon-fictionchildrencomics, graphicpoetry05k10k15k20k25k30k
10k20k30kCountTop 10 Subgenres in DatasetSecondary GenreCount
plotly-logomark
In [6]:
## Distribution of third genres, if they exist

# Function to get the third genre, if it exists
def get_third_genre(genre_dict):
    """Return the third genre key when the dict holds at least three genres, else None."""
    if isinstance(genre_dict, dict):
        keys = list(genre_dict)
        if len(keys) >= 3:
            return keys[2]
    return None  # Not a dict, or fewer than three genres

# Derive the third genre for every row
df['third_genre'] = df['genres'].apply(get_third_genre)

# Tally how often each third genre occurs
third_genre_counts = df['third_genre'].value_counts().reset_index()
third_genre_counts.columns = ['third_genre', 'count']

# Bar chart of the ten most common third genres
fig = px.bar(
    third_genre_counts.head(10),
    x='third_genre',
    y='count',
    color='count',
    color_continuous_scale='Viridis',
    title='Top 10 Third Genres in Dataset',
    labels={'third_genre': 'Third Genre', 'count': 'Count'}
)

# Tidy axis labels, size and margins
fig.update_layout(
    xaxis_title='Third Genre',
    yaxis_title='Count',
    xaxis_tickangle=-45,
    width=900,
    height=500,
    margin=dict(l=60, r=60, t=60, b=100)
)

fig.show()
fictionromancefantasy, paranormalyoung-adultmystery, thriller, crimehistory, historical fiction, biographychildrencomics, graphicnon-fictionpoetry05k10k15k20k
5k10k15kCountTop 10 Third Genres in DatasetThird GenreCount
plotly-logomark
In [7]:
## Average rating for each genre

# Mean rating per primary genre, highest first
avg_rating_by_genre = (
    df.groupby('primary_genre')['rating']
    .mean()
    .reset_index()
    .sort_values(by='rating', ascending=False)
)

# Bar chart of the ten best-rated genres
fig = px.bar(
    avg_rating_by_genre.head(10),
    x='primary_genre',
    y='rating',
    color='rating',
    color_continuous_scale='purp',
    title='Top 10 Genres by Average Rating',
    labels={'primary_genre': 'Genre', 'rating': 'Average Rating'}
)

# Tidy axis labels, size and margins
fig.update_layout(
    xaxis_title='Genre',
    yaxis_title='Average Rating',
    xaxis_tickangle=-45,
    width=800,
    height=500,
    margin=dict(l=40, r=40, t=60, b=100)
)

fig.show()
poetryfantasy, paranormalromancechildrencomics, graphicyoung-adulthistory, historical fiction, biographymystery, thriller, crimefictionnon-fiction01234
3.73.83.9Average RatingTop 10 Genres by Average RatingGenreAverage Rating
plotly-logomark
In [8]:
## Distribution of ratings

# Histogram of the 0-5 star ratings
fig = px.histogram(
    df,
    x='rating',
    nbins=10,
    labels={'rating': 'Rating'},
    color_discrete_sequence=['skyblue'],
    title='Distribution of Ratings'
)

# Slightly translucent bars
fig.update_traces(opacity=0.85)

# Tidy layout: centred title, light gridlines, fixed size
fig.update_layout(
    xaxis_title='Rating',
    yaxis_title='Count',
    width=800,
    height=500,
    title_x=0.5,
    xaxis=dict(gridcolor='lightgray'),
    yaxis=dict(gridcolor='lightgray'),
    margin=dict(l=40, r=40, t=60, b=100)
)

fig.show()
01234505k10k15k20k25k30k35k
Distribution of RatingsRatingCount
plotly-logomark
In [9]:
## Average length of review by genre

# Word count per review
df['review_word_count'] = df['review_text'].apply(lambda text: len(str(text).split()))

# Mean word count per primary genre, longest first
avg_word_count_by_genre = (
    df.groupby('primary_genre')['review_word_count']
    .mean()
    .reset_index()
    .sort_values(by='review_word_count', ascending=False)
)

# Bar chart of the ten wordiest genres
fig = px.bar(
    avg_word_count_by_genre.head(10),
    x='primary_genre',
    y='review_word_count',
    color='review_word_count',
    color_continuous_scale='Sunsetdark',
    title='Average Review Word Count by Genre',
    labels={'primary_genre': 'Genre', 'review_word_count': 'Average Word Count'}
)

# Tidy layout
fig.update_traces(opacity=0.85)
fig.update_layout(
    xaxis_title='Genre',
    yaxis_title='Average Word Count',
    title_x=0.5,
    width=800,
    height=500,
    xaxis=dict(tickangle=-45, title_standoff=10),
    margin=dict(l=40, r=40, t=60, b=100)
)

fig.show()
history, historical fiction, biographyromanceyoung-adultfantasy, paranormalfictionnon-fictionmystery, thriller, crimecomics, graphicpoetrychildren050100150
80100120140160Average Word CountAverage Review Word Count by GenreGenreAverage Word Count
plotly-logomark
In [10]:
# NOTE(review): this cell is intentionally disabled (fully commented out).
# It computed average rating by year from 'date_added'. Either re-enable it
# or delete it — cells of only commented-out code clutter the narrative.
# ## Average rating over time

# # Imports
# import time

# # Convert 'date_added' to datetime and handle errors by setting invalid values to NaT
# df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

# # Check for any invalid dates (NaT)
# invalid_dates = df[df['date_added'].isna()]

# # Drop rows where 'date_added' is NaT (invalid dates)
# df = df.dropna(subset=['date_added'])

# # Extract the year from 'date_added'
# df['year_added'] = df['date_added'].dt.year

# # Calculate average rating by year
# avg_rating_by_year = df.groupby('year_added')['rating'].mean().reset_index()

# # Create line plot for average rating over time
# fig = px.line(avg_rating_by_year, 
#               x='year_added', 
#               y='rating', 
#               title='Average Rating Over Time',
#               labels={'year_added': 'Year', 'rating': 'Average Rating'},
#               markers=True)

# fig.update_layout(xaxis_title='Year', yaxis_title='Average Rating')
# fig.show()

Prepare target data¶

In [11]:
## REPLACE THE FIRST GENRE 'FICTION' WITH ITS SECONDARY GENRE

# Imports
from collections import Counter

# How many 'fiction' books lack a usable secondary genre?
is_fiction = df['primary_genre'] == 'fiction'
no_secondary = df['secondary_genre'].isna() | (df['secondary_genre'] == '')
num_fiction_with_no_secondary_genre = df[is_fiction & no_secondary].shape[0]
print(f"Number of 'fiction' books with no secondary genre: {num_fiction_with_no_secondary_genre}")
print()

# Where the primary genre is the generic 'fiction', promote the secondary genre
# (assignment aligns on the index, so each row gets its own secondary genre)
df.loc[is_fiction, 'primary_genre'] = df['secondary_genre']

# Check new genres
genre_counts = Counter(df['primary_genre'])
for genre, count in genre_counts.items():
    print(f"{genre}: {count}")
print()

# Check for None or NaN values in the primary_genre column
nan_rows = df[df['primary_genre'].isnull()]
print(f"Number of rows with missing primary genre: {nan_rows.shape[0]}")
Number of 'fiction' books with no secondary genre: 0

fantasy, paranormal: 18764
mystery, thriller, crime: 11672
history, historical fiction, biography: 12796
non-fiction: 11715
children: 4128
young-adult: 11254
romance: 20321
comics, graphic: 6369
poetry: 1031

Number of rows with missing primary genre: 0

Data preprocessing¶

In [12]:
## CLEAN REVIEW DATA

# Imports
import nltk
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources the pipeline needs (no-op when already present)
for resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt'):
    nltk.download(resource)

# Define the stop words set
stop_words = set(stopwords.words('english'))

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_review(review_text):
    """Lowercase, tokenize, de-stopword, lemmatize and clean one review.

    Parameters
    ----------
    review_text : str
        Raw review text.

    Returns
    -------
    list of str
        Cleaned tokens: lemmatized, alphanumeric-only, non-numeric,
        and longer than two characters.
    """
    # Lowercase and tokenize
    tokens = word_tokenize(review_text.lower())
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Strip non-alphanumeric characters and drop tokens that become empty.
    # (Fix: the original ran re.sub twice per token — once for the value and
    # once for the filter; substitute once, then filter.)
    stripped = (re.sub(r'\W+', '', token) for token in tokens)
    tokens = [token for token in stripped if token != '']
    
    # Remove numbers
    tokens = [token for token in tokens if not token.isdigit()]
    
    # Remove short words (length <= 2)
    tokens = [token for token in tokens if len(token) > 2]
    
    return tokens

# Apply the preprocessing function to the reviews
df['final_reviews'] = df['review_text'].apply(preprocess_review)

# Drop duplicates
# NOTE(review): drop_duplicates on a list-valued column can raise TypeError
# ("unhashable type: 'list'") on some pandas versions — consider deduplicating
# on a joined-string representation instead. Confirm against your pandas version.
df = df.drop_duplicates(subset=['final_reviews'])

# View a result
print(df.final_reviews[0], '\n')
[nltk_data] Downloading package stopwords to
[nltk_data]     /users/PAS2038/bittner87/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /users/PAS2038/bittner87/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/PAS2038/bittner87/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /users/PAS2038/bittner87/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['mind', 'blowingly', 'cool', 'best', 'science', 'fiction', 'read', 'time', 'loved', 'description', 'society', 'future', 'lived', 'tree', 'notion', 'owning', 'property', 'even', 'getting', 'married', 'gone', 'every', 'surface', 'screen', 'undulation', 'society', 'responds', 'trisolaran', 'threat', 'seem', 'surprising', 'maybe', 'chinese', 'perspective', 'would', 'thought', 'eto', 'would', 'exist', 'book', 'would', 'thought', 'people', 'would', 'get', 'overconfident', 'primitive', 'fleet', 'chance', 'given', 'think', 'superior', 'science', 'would', 'weapon', 'defense', 'would', 'rifle', 'arrow', 'moment', 'luo', 'wallfacer', 'cool', 'may', 'actually', 'done', 'fist', 'pump', 'though', 'way', 'dark', 'forest', 'theory', 'right', 'see', 'reason', 'would', 'society', 'probably', 'stop', 'broadcasting', 'much', 'signal', 'universe'] 

Feature engineering¶

In [13]:
## TF-IDF VECTORIZATION

# Imports
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Set a limit on the number of features

# Re-join the cleaned tokens into strings, the input format TF-IDF expects
df['review_text_str'] = df['final_reviews'].apply(lambda x: ' '.join(x))

# Fit and transform the reviews (result stays a sparse matrix)
tfidf_matrix = vectorizer.fit_transform(df['review_text_str'])

# Densify ONLY the first five rows for inspection. The original converted the
# whole ~95k x 5000 matrix to a dense array (~3.8 GB of float64) just to print
# a head; slicing the sparse matrix first prints the same head for ~1/20000th
# of the memory.
tfidf_df = pd.DataFrame(
    tfidf_matrix[:5].toarray(),
    columns=vectorizer.get_feature_names_out()
)
print('Shape of the TF-IDF matrix:', tfidf_matrix.shape)
print()
print(f'TF-IDF DataFrame head:\n{tfidf_df.head()}')
Shape of the TF-IDF matrix: (94693, 5000)

TF-IDF DataFrame head:
   1960s  1970s  19th  1st  20th  2nd       3rd  4th  5th  aaron  ...  \
0    0.0    0.0   0.0  0.0   0.0  0.0  0.000000  0.0  0.0    0.0  ...   
1    0.0    0.0   0.0  0.0   0.0  0.0  0.136481  0.0  0.0    0.0  ...   
2    0.0    0.0   0.0  0.0   0.0  0.0  0.000000  0.0  0.0    0.0  ...   
3    0.0    0.0   0.0  0.0   0.0  0.0  0.000000  0.0  0.0    0.0  ...   
4    0.0    0.0   0.0  0.0   0.0  0.0  0.000000  0.0  0.0    0.0  ...   

   youngest  youth  youtube  yummy  zach  zane  zero  zoe  zombie  zone  
0       0.0    0.0      0.0    0.0   0.0   0.0   0.0  0.0     0.0   0.0  
1       0.0    0.0      0.0    0.0   0.0   0.0   0.0  0.0     0.0   0.0  
2       0.0    0.0      0.0    0.0   0.0   0.0   0.0  0.0     0.0   0.0  
3       0.0    0.0      0.0    0.0   0.0   0.0   0.0  0.0     0.0   0.0  
4       0.0    0.0      0.0    0.0   0.0   0.0   0.0  0.0     0.0   0.0  

[5 rows x 5000 columns]
In [14]:
## SENTIMENT ANALYSIS

# Imports
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Get sentiment score
def get_sentiment_score(tokens):
    """Return the VADER compound score for a token list, or NaN when empty.

    Parameters
    ----------
    tokens : list of str or None
        Preprocessed review tokens.

    Returns
    -------
    float
        Compound polarity in [-1, 1], or NaN for empty/None input.
    """
    # Fix: `not tokens` already covers both None and the empty list;
    # the original's extra `or tokens is None` clause was unreachable.
    if not tokens:
        return np.nan  # Return NaN if no valid tokens
    # Join tokens back into a string for sentiment analysis
    review_text = ' '.join(tokens)
    sentiment_score = sia.polarity_scores(review_text)
    return sentiment_score['compound']

# Apply sentiment analysis to the 'final_reviews' column
df['sentiment_score'] = df['final_reviews'].apply(get_sentiment_score)

# Verify the results
print(df[['final_reviews', 'sentiment_score']].head())
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /users/PAS2038/bittner87/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
                                       final_reviews  sentiment_score
0  [mind, blowingly, cool, best, science, fiction...           0.9186
1  [took, month, read, first, page, book, took, l...           0.9738
2                                                 []              NaN
3  [special, book, started, slow, first, third, m...           0.9764
4  [like, contact, love, book, unfolds, like, mys...           0.8481

Graphs of sentiment¶

In [15]:
## Distribution of sentiment scores

# Histogram of VADER compound scores across all reviews
fig = px.histogram(
    df,
    x='sentiment_score',
    nbins=30,
    labels={'sentiment_score': 'Sentiment Score'},
    title='Distribution of Sentiment Scores'
)
fig.update_layout(xaxis_title='Sentiment Score', yaxis_title='Frequency')
fig.show()
−1−0.500.5105k10k15k20k25k
Distribution of Sentiment ScoresSentiment ScoreFrequency
plotly-logomark
In [16]:
## Average sentiment score per primary genre

# Mean compound sentiment per primary genre
avg_sentiment_by_genre = df.groupby('primary_genre')['sentiment_score'].mean().reset_index()

# Bar chart of average sentiment per genre
fig = px.bar(
    avg_sentiment_by_genre,
    x='primary_genre',
    y='sentiment_score',
    color='sentiment_score',
    color_continuous_scale='tealgrn',
    title='Average Sentiment Score by Genre',
    labels={'primary_genre': 'Primary Genre', 'sentiment_score': 'Average Sentiment Score'}
)

# Tidy layout
fig.update_traces(opacity=0.85)
fig.update_layout(
    xaxis_title='Primary Genre',
    yaxis_title='Average Sentiment Score',
    title_x=0.5,
    width=800,
    height=500,
    xaxis=dict(tickangle=-45, title_standoff=10),
    margin=dict(l=40, r=40, t=60, b=100)
)

fig.show()
childrencomics, graphicfantasy, paranormalhistory, historical fiction, biographymystery, thriller, crimenon-fictionpoetryromanceyoung-adult00.20.40.6
0.450.50.550.60.65Average Sentiment ScoreAverage Sentiment Score by GenrePrimary GenreAverage Sentiment Score
plotly-logomark
In [17]:
## Sentiment vs review length

# Token count of each cleaned review
df['review_length'] = df['final_reviews'].apply(len)

# Scatter of sentiment against review length
fig = px.scatter(
    df,
    x='review_length',
    y='sentiment_score',
    opacity=0.6,
    color='sentiment_score',
    color_continuous_scale='spectral',
    title='Sentiment Score vs. Review Length',
    labels={'review_length': 'Review Length', 'sentiment_score': 'Sentiment Score'}
)

# Tidy layout
fig.update_traces(opacity=0.85)
fig.update_layout(
    xaxis_title='Review Length',
    yaxis_title='Sentiment Score',
    title_x=0.5,
    width=800,
    height=500,
    xaxis=dict(tickangle=-45, title_standoff=10),
    margin=dict(l=40, r=40, t=60, b=100)
)

fig.show()
050010001500−1−0.500.51
−0.500.5Sentiment ScoreSentiment Score vs. Review LengthReview LengthSentiment Score
plotly-logomark
In [18]:
# NOTE(review): this cell is intentionally disabled (fully commented out).
# It plotted average sentiment by year from 'date_added'. Either re-enable
# it or delete it — cells of only commented-out code clutter the narrative.
# ## Sentiment over time

# # Ensure 'date_added' is a datetime object
# df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

# # Drop rows with invalid 'date_added' values
# df = df.dropna(subset=['date_added'])

# # Extract the year for grouping
# df['year_added'] = df['date_added'].dt.year

# # Calculate average sentiment over time (by year)
# avg_sentiment_by_year = df.groupby('year_added')['sentiment_score'].mean().reset_index()

# # Line chart of sentiment over time 
# fig = px.line(avg_sentiment_by_year, x='year_added', y='sentiment_score', 
#               title='Average Sentiment Score Over Time',
#               labels={'year_added': 'Year', 'sentiment_score': 'Average Sentiment Score'}, 
#               markers=True)
# fig.update_layout(xaxis_title='Year', yaxis_title='Average Sentiment Score')
# fig.show()

Model¶

In [19]:
## DEFINE, ENCODE, SPLIT

# Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Define X and y
X = tfidf_matrix  # vectorized reviews
y = df['primary_genre']  # target: primary genre

# Encode the target labels (genres)
le = LabelEncoder()
y = le.fit_transform(y)

# Split the data into training and testing sets (80/20 split)
# NOTE(review): the genre counts above are heavily imbalanced (poetry ~1k vs
# romance ~20k) — consider stratify=y so both splits keep class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [20]:
## SMOTE - Synthetic minority over-sampling

# Imports
# (Fix: removed `from imblearn.combine import SMOTEENN` — it was never used
# anywhere in the notebook.)
from imblearn.over_sampling import SMOTE

# Oversample minority genres in the TRAINING set only (never the test set).
# k_neighbors=3 keeps SMOTE usable for the smallest class.
smote = SMOTE(sampling_strategy='auto', k_neighbors=3, random_state=42)

# NOTE: this rebinds X_train/y_train in place of the split data, so the cell
# is not idempotent — rerunning it resamples already-resampled data.
X_train, y_train = smote.fit_resample(X_train, y_train)
In [21]:
## CREATE AND TRAIN MODEL

# Imports
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier(
    n_estimators=500,        # Number of trees
    max_depth=None,          # Allow trees to grow as deep as they want
    min_samples_split=2,     # Minimum samples to split
    min_samples_leaf=4,      # Minimum samples at leaf
    max_features='log2',     # Number of features to consider at each split
    bootstrap=True,          # Use bootstrap sampling
    criterion='gini',        # Quality of split
    random_state=42,         # For reproducibility
    n_jobs=-1                # Train trees in parallel; with a fixed
                             # random_state this does not change the results
)

# Train the model (500 trees over the SMOTE-resampled training set)
model.fit(X_train, y_train)
Out[21]:
RandomForestClassifier(max_features='log2', min_samples_leaf=4,
                       n_estimators=500, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(max_features='log2', min_samples_leaf=4,
                       n_estimators=500, random_state=42)

Evaluate model¶

In [29]:
## CLASSIFICATION REPORT AND CONFUSION MATRIX

# Imports
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt

# Predict genres for the held-out test set
y_pred = model.predict(X_test)

# Overall accuracy plus per-class precision/recall/F1
print('Accuracy:', accuracy_score(y_test, y_pred))
print()
print('Classification Report:\n', classification_report(y_test, y_pred, target_names=le.classes_))

# Confusion matrix with human-readable genre labels
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)

# Draw on a square figure with vertical x tick labels
fig, ax = plt.subplots(figsize=(8,8))
disp.plot(ax=ax, xticks_rotation='vertical')

# Title, spacing, render
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()
Accuracy: 0.5065209356354612

Classification Report:
                                         precision    recall  f1-score   support

                              children       0.45      0.68      0.54       819
                       comics, graphic       0.56      0.75      0.64      1212
                   fantasy, paranormal       0.51      0.42      0.46      3598
history, historical fiction, biography       0.52      0.33      0.40      2478
              mystery, thriller, crime       0.49      0.46      0.47      2181
                           non-fiction       0.53      0.60      0.56      2345
                                poetry       0.14      0.66      0.23       191
                               romance       0.57      0.65      0.61      3926
                           young-adult       0.46      0.31      0.37      2189

                              accuracy                           0.51     18939
                             macro avg       0.47      0.54      0.48     18939
                          weighted avg       0.52      0.51      0.50     18939

In [23]:
## ROC Curve
#
# Fix: the target is multiclass, but roc_curve is binary. With pos_label=1
# this evaluates ONLY class index 1 one-vs-rest, yet the original title
# ("ROC Curve (AUC = ...)") read as an overall AUC — misleading. The title
# now names the class actually evaluated. For a global view, plot one curve
# per class or use roc_auc_score(..., multi_class='ovr').

# Imports
from sklearn.metrics import roc_curve, auc

# Score the test set once; column i holds the predicted probability of class i
y_scores = model.predict_proba(X_test)

# One-vs-rest ROC for class index 1
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:, 1], pos_label=1)

# Compute AUC
roc_auc = auc(fpr, tpr)

# Plot ROC curve, labelled with the evaluated class
fig_roc = px.area(x=fpr, y=tpr, 
                  title=f'ROC Curve for {le.classes_[1]} vs rest (AUC = {roc_auc:.2f})', 
                  labels={'x': 'False Positive Rate', 'y': 'True Positive Rate'},
                  height=400)
fig_roc.update_traces(fillcolor="rgba(157, 211, 144, 0.4)", 
                      line_color="rgba(157, 211, 144, 1)", 
                      opacity=0.4)

# Add the chance diagonal for reference
fig_roc.add_scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random Classifier', line=dict(dash='dash', color='rgb(252, 115, 212)'))

fig_roc.show()
00.20.40.60.8100.20.40.60.81
Random ClassifierROC Curve (AUC = 0.93)False Positive RateTrue Positive Rate
plotly-logomark
In [35]:
## Precision Recall Curve

# Imports
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import label_binarize
import plotly.express as px

# Binarize the labels for multiclass (one-vs-rest) precision-recall computation
y_test_bin = label_binarize(y_test, classes=range(len(le.classes_)))

# Score the test set ONCE. Fix: the original called model.predict_proba(X_test)
# inside the per-class loop, recomputing the same 500-tree prediction for
# every one of the nine classes.
y_scores = model.predict_proba(X_test)

# Initialize the plot
fig_pr = px.area(title='Precision-Recall Curve', 
                 labels={'x': 'Recall', 'y': 'Precision'}, 
                 height=600)

# One precision-recall curve per genre (one-vs-rest)
for i in range(y_test_bin.shape[1]):
    precision, recall, _ = precision_recall_curve(y_test_bin[:, i], y_scores[:, i])
    fig_pr.add_scatter(x=recall, y=precision, mode='lines', name=f'{le.classes_[i]}')

# Update axis labels 
fig_pr.update_layout(
    xaxis_title='Recall',
    yaxis_title='Precision',
    height=600
)

fig_pr.show()
00.20.40.60.8100.20.40.60.81
childrencomics, graphicfantasy, paranormalhistory, historical fiction, biographymystery, thriller, crimenon-fictionpoetryromanceyoung-adultPrecision-Recall CurveRecallPrecision
plotly-logomark
In [25]:
## Feature Importance

# Impurity-based importances from the trained forest
importances = model.feature_importances_

# Matching TF-IDF feature (token) names
features = vectorizer.get_feature_names_out()

# Pair names with importances and rank descending
feat_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
feat_importance_df = feat_importance_df.sort_values(by='Importance', ascending=False)

# Bar chart of the twenty most important tokens
fig_feat_importance = px.bar(
    feat_importance_df.head(20),
    x='Feature',
    y='Importance',
    color='Importance',
    color_continuous_scale='tealgrn',
    range_color=[0, max(feat_importance_df['Importance'])],
    title='Top 20 Features',
    labels={'Feature': 'Feature', 'Importance': 'Importance'},
    height=400
)

# Update layout
fig_feat_importance.update_layout(
    xaxis_tickangle=45,
    xaxis={'tickfont': {'size': 12}},
    yaxis_title='Importance',
    yaxis={'tickfont': {'size': 14}},
    coloraxis_colorbar=dict(title='Importance'),
    title={'font': {'size': 16}},
    showlegend=False
)

fig_feat_importance.show()
poetrypoemstorytimescomicvolumepreschoolbookillustrationcharacterchildtoddlermysteryartseriesstorykidcollectionissuegraphiclove00.0050.010.0150.02
00.0050.010.0150.02ImportanceTop 20 FeaturesFeatureImportance
plotly-logomark
In [ ]: